# To build the container (Singularity version >=2.6):
# sudo singularity build CRISPRCasFinder Singularity

# Base image: the "ubuntu:xenial" image, obtained through the docker bootstrap agent.
BootStrap: docker
from: ubuntu:xenial

# Image metadata: maintainer, authors and the packaged CRISPRCasFinder version.
%labels
	MAINTAINER Bertrand Neron <bneron@pasteur.fr>
	AUTHOR Couvin David, Bernheim Aude, Toffano-Nioche Claire, Touchon Marie, Michalik Juraj, Neron Bertrand, Rocha Eduardo, Vergnaud Gilles, Gautheret Daniel, Pourcel Christine.
    CRISPRCasFinder.version 4.2.18

%help
    Name:
      CRISPRCasFinder standalone version 4.2.18

    Synopsis:
      A perl script to identify CRISPR arrays and associated Cas genes in DNA sequences

    Usage:
      ./CRISPRCasFinder.img <filename.fasta>
      OR
      singularity run CRISPRCasFinder.img [options] -in <filename.fasta>

      --Please note <filename.fasta> must be in Fasta format. Please also note that when several options are called, the option "-in or -i" must precede the input FASTA file.

    General:
      -help or -h           This help
      -version or -v        The current version of the program will be displayed

    Other options:

      [Input/Output and -so]
      -in or -i [XXX]       Input Fasta file (with extensions: .fasta, .fna, .mfa, .fa)
      -outdir or -out [XXX] Output directory (if users do not use this option, a default directory will be created with the date and time)
      -keepAll or -keep     Option allowing to keep secondary folders/files (Prodigal/Prokka, CasFinder, rawFASTA, Properties); (default: 0)
      -LOG or -log          Option allowing to write LOG files (default: 0)
      -HTML or -html        Option allowing to display results as a static HTML web page (default value: 0). The web page created (index.html) will be dependent on a CSS file (crispr.css)
      -copyCSS [XXX]        Option allowing to copy provided CSS file into "Visualization" repository if option -HTML is set (default: '/usr/local/share/CRISPRCasFinder/crispr.css')
      -soFile or -so [XXX]  Option allowing to use the shared object file if it is not present in current directory (default: '/.singularity.d/libs/sel392v2.so')

     [Detection of CRISPR arrays]
      -mismDRs or -md [XXX] Percentage of mismatches allowed between DRs (default: 20)
      -truncDR or -t [XXX]  Percentage of mismatches allowed for truncated DR (default: 33.3)
      -minDR or -mr [XXX]   Minimal size of DRs (default: 23)
      -maxDR or -xr [XXX]   Maximal size of DRs (default: 55)
      -minSP or -ms [XXX]   Minimal size of Spacers (default: 25)
      -maxSP or -xs [XXX]   Maximal size of Spacers (default: 60)
      -noMism or -n Option used to disallow mismatches (default value is 1 when this option is not called, i.e. mismatches are allowed by default)
      -percSPmin or -pm [XXX]       Minimal Spacers size in function of DR size (default: 0.6)
      -percSPmax or -px [XXX]       Maximal Spacers size in function of DR size (default: 2.5)
      -spSim or -s [XXX]    Maximal allowed percentage of similarity between Spacers (default: 60)
      -DBcrispr or -dbc [XXX]       Option allowing to use a CSV file of all CRISPR candidates contained in CRISPRdb (from last update) (default: '/usr/local/share/CRISPRCasFinder/CRISPR_crisprdb.csv')
      -repeats or -rpts [XXX]       Option allowing to use a consensus repeats list generated by CRISPRdb (default: '/usr/local/share/CRISPRCasFinder/Repeat_List.csv')
      -DIRrepeat or -drpt [XXX]     Option allowing to use a file containing repeat IDs and orientation according to CRISPRDirection (default: '/usr/local/share/CRISPRCasFinder/repeatDirection.tsv')
      -flank or -fl [XXX]   Option allowing to set size of flanking regions in base pairs (bp) for each analyzed CRISPR array (default: 100)
      -levelMin or -lMin [XXX]      Option allowing to choose the minimum evidence-level corresponding to CRISPR arrays we want to display (default:1)

      [Detection of Cas clusters]
      -cas or -cs   Search corresponding Cas genes using Prokka (default kingdom: "Bacteria") and MacSyFinder (default: 0)
      -ccvRep or -ccvr      Option used to write the CRISPR-Cas vicinity report (CRISPRs and Cas) if option -cas is set (default: 0)
      -vicinity or -vi [XXX]        Option used to define number of nucleotides separating a CRISPR array from its neighboring Cas system (default: 600)
      -CASFinder or -cf [XXX]       Option allowing to use a custom CasFinder instead of using the CasFinder provided by Institut Pasteur  (default: '/usr/local/share/macsyfinder/CasFinder-2.0')
      -cpuMacSyFinder or -cpuM [XXX]        Option allowing to set number of CPUs to use for MacSyFinder (default: 1)
      -rcfowce      Option allowing to run Casfinder only when any CRISPR exists (default: 0) (set if -cas is set)
      -definition or -def [XXX]     Option allowing to specify CasFinder definition (if option -cas is set) to be more or less stringent (allowed values: 'General', 'Typing' or 'SubTyping'; default: 'General')
      -gffAnnot or -gff [XXX]       Option allowing user to provide an annotation GFF file (if options -cas and -faa are set) (default: '')
      -proteome or -faa [XXX]       Option allowing user to provide a proteome file '.faa' (if options -cas and -gff are set) (default: '')
      -cluster or -ccc [XXX]        Option allowing to constitute clusters or groups of CRISPR or Cas systems given a determined threshold e.g. 20kb (default: 0) (set if -cas is set)
      -getSummaryCasfinder or -gscf Option allowing to get summary file of Cas-finder (MacSyFinder) and copy it to TSV repository (default: 0)
      -geneticCode or -gcode [XXX]  Option allowing to modify the genetic code (translation table) for CDS annotation (default: 11)

      [Use Prokka instead of Prodigal (default option)]
      -useProkka or -prokka Option allowing to use Prokka instead of Prodigal (default: 0)
      -cpuProkka or -cpuP [XXX]     Option allowing to set number of CPUs to use for Prokka (default: 1)
      -metagenome or -meta  Option allowing to better analyze metagenome with Prokka (default: )
      -ArchaCas or -ac      same option as -cas using "Archaea" as default kingdom instead of "Bacteria" (default: 0). Option to be used when -prokka is used.

    Options waiting for a given parameter (filename, text, or number) are followed by symbols "[XXX]". Other options could be considered as booleans (yes or no, 1 or 0).

    #####################################################

    The input file should meet these constraints:

    - the file name must not contain multiple dots (an acceptable file name is e.g. "multifasta.fna")
    - the sequence must be identified/named (the ID follows character ">", and a description could be added after a space character),
    - the ID should not contain special characters such as "|$%" or multiple dots,
    - the file must contain nucleotides (not amino acids),
    - the file could contain several sequences in FASTA format,
    - each ID must be unique,
    - the ID and the file name must not be too long,
    - the ID will be used for output.

    Examples:
    (1): ./CRISPRCasFinder.simg test.fasta
    In this example, your result folder will be in the directory named: "Result_test"

    (2): ./CRISPRCasFinder -in test.fasta -md 20 -t 33.3 -mr 23 -xr 55 -ms 25 -xs 60 -pm 0.6 -px 2.5 -s 60

    (3): ./CRISPRCasFinder -in genomes100.fna -drpt my_repeatDirection.tsv -rpts my_Repeat_List.csv -cs -fr -dbc my_CRISPR_crisprdb.csv -html

    (4): ./CRISPRCasFinder -in metagenome.fna -rcfowce -prokka -log -out Results_metagenome -cpuProkka 8 -cpuMacSyFinder 8 -meta

    (5): ./CRISPRCasFinder -in sequence.fasta -cas -log -out RES_Sequence -def G -force

%environment
    # Force the plain "C" locale for everything run in the container.
    export LC_ALL='C'

%setup
    # Stage the Singularity-specific patch inside the image's root filesystem
    # so that %post can apply it to CRISPRCasFinder.pl.
    # The patch file is looked up relative to the directory the build is started from.
    # Expansions are quoted so a root filesystem path containing whitespace still works.
    mkdir -p "${SINGULARITY_ROOTFS}/usr/local/src/CRISPRCasFinder"
    cp CRISPRCasFinder.singularity.patch "${SINGULARITY_ROOTFS}/usr/local/src/CRISPRCasFinder/"

%post
    # Build-time provisioning of the image.
    # Keep apt from prompting during package configuration.
    export DEBIAN_FRONTEND=noninteractive
    apt-get update
    apt-get install -y apt-utils zlib1g-dev make gcc
    # dash is too restricted: make /bin/sh point at bash instead
    ln -nsf /bin/bash /bin/sh
    # to be runnable on tars @ Institut Pasteur
    # (-p: do not fail if the directory is already present)
    mkdir -p /pasteur

    # General-purpose tools used by the installation steps below.
    apt-get update -y
    apt-get install -y curl default-jre python perl parallel cpanminus patch wget unzip

    ###################
    # Bioinfo package #
    ###################
    # Bioinformatics tools and Perl libraries taken from the distribution.
    # The list ends without a trailing backslash so the command cannot
    # accidentally swallow whatever line follows it.
    apt-get install -y \
        hmmer \
        emboss emboss-lib \
        ncbi-blast+ \
        bioperl \
        bioperl-run \
        libdatetime-perl \
        libxml-simple-perl \
        libdigest-md5-perl \
        clustalw \
        muscle \
        prodigal \
        aragorn \
        infernal

    # Make the clustalw executable reachable under the name "clustalw2" too.
    # The link target must be "clustalw": linking "clustalw2" to "clustalw2"
    # only produces a symlink that points at itself.
    # NOTE(review): assumes the clustalw package installs /usr/bin/clustalw - confirm.
    cd /usr/bin
    ln -s clustalw clustalw2
    cd /

    # Perl modules installed from CPAN, one cpanm invocation per module.
    for module in \
        Try::Tiny \
        Test::Most \
        JSON::Parse \
        Date::Calc \
        Class::Struct \
        Bio::DB::Fasta \
        File::Copy
    do
        cpanm "${module}"
    done

    # These two are requested together in a single cpanm invocation.
    cpanm Bio::Seq Bio::SeqIO

    # The alignment wrappers are installed with --force.
    for module in \
        Bio::Tools::Run::Alignment::Clustalw \
        Bio::Tools::Run::Alignment::Muscle
    do
        cpanm --force "${module}"
    done

    # Installation prefix shared by all the sections below.
    prefix="/usr/local"

    ##########
    # vmatch #
    ##########
    # Download the prebuilt vmatch distribution, compile the sel392 selection
    # function as a shared object, and install the vmatch tools under the
    # names vmatch2 / vsubseqselect2 / mkvtree2.
    PN="vmatch"
    PV="2.3.0"
    P="${PN}-${PV}"
    P_SRC=${prefix}/src/${PN}

    mkdir -p "${P_SRC}"
    cd "${P_SRC}"
    distribution='Linux_x86_64'
    vmatch="${PN}-${PV}-${distribution}-64bit"
    vmatch_url="http://vmatch.de/distributions/${vmatch}.tar.gz"
    # --fail makes an HTTP error abort here instead of saving an error page
    # that tar would later reject; --show-error keeps the reason visible
    # despite --silent.
    curl -L -O --silent --show-error --fail "${vmatch_url}"
    tar -zxf "${vmatch}.tar.gz"
    cd "${vmatch}"
    gcc -Wall -Werror -fPIC -O3 -shared SELECT/sel392.c -m64 -o sel392v2.so
    # copy the shared library in LD_LIBRARY_PATH
    install -m 0775 sel392v2.so /.singularity.d/libs/sel392v2.so
    # also expose it as sel392.so (relative link, resolved next to the library)
    ln -s sel392v2.so /.singularity.d/libs/sel392.so
    install -m 0775 vmatch "${prefix}/bin/vmatch2"
    install -m 0775 vsubseqselect "${prefix}/bin/vsubseqselect2"
    install -m 0775 mkvtree "${prefix}/bin/mkvtree2"
    cd /

    ###############
    # macsyfinder #
    ###############
    # Fetch the MacSyFinder source tarball and install it with its setup.py.
    PN="macsyfinder"
    PV="1.0.5"
    P="${PN}-${PV}"
    P_SRC=${prefix}/src/${PN}

    mkdir -p "${P_SRC}"
    cd "${P_SRC}"
    # NOTE(review): the Bintray hosting service has been shut down, so this URL
    # probably needs to point at another mirror of ${P}.tar.gz - confirm.
    macsyfinder_url="https://dl.bintray.com/gem-pasteur/MacSyFinder/${P}.tar.gz"
    # --fail turns an HTTP error into an immediate, explicit download failure
    # instead of a confusing tar error on a saved error page.
    curl -L -O --silent --show-error --fail "${macsyfinder_url}"
    tar -xzf "${P}.tar.gz"
    cd "${P}"
    python setup.py build
    python setup.py install
    cd /

    #######################
    # prokka dependencies #
    #######################

    ###########
    # signalp #
    ###########

    # Cannot be installed due to Licensing problem.

    ###########
    # tbl2asn #
    ###########
    # The distribution package ncbi-tools-bin provides a too old tbl2asn,
    # so the binary published by NCBI is downloaded instead.
    # The download URL carries no version number, hence no PV/P here.
    PN="tbl2asn"
    P_SRC=${prefix}/src/${PN}

    mkdir -p "${P_SRC}"
    cd "${P_SRC}"
    tbl2asn_url="ftp://ftp.ncbi.nih.gov/toolbox/ncbi_tools/converters/by_program/${PN}/linux64.${PN}.gz"
    wget "${tbl2asn_url}"
    gunzip "linux64.${PN}.gz"
    install -m 0755 "linux64.${PN}" "${prefix}/bin/${PN}"

    ##########
    # prokka #
    ##########
    # Download the prokka release, install its scripts, bundled binaries and
    # databases under ${prefix}, then index the databases.
    PN="prokka"
    PV="1.12"
    P="${PN}-${PV}"
    P_SRC=${prefix}/src/${PN}

    mkdir -p "${P_SRC}"
    cd "${P_SRC}"

    prokka_url="http://www.vicbioinformatics.com/${P}.tar.gz"
    # --fail aborts on an HTTP error instead of saving an error page that tar
    # would later reject; --show-error keeps the reason visible despite --silent.
    curl -L -O --silent --show-error --fail "${prokka_url}"
    tar -xzf "${P}.tar.gz"
    cd "${P}"

    prokka_data=${prefix}/share/${PN}
    prokka_db=${prokka_data}/db
    mkdir -p "${prokka_db}"
    # copy database
    cp -pr db/* "${prokka_db}"

    # tell prokka where to find its tools and db once installed
    sed -i -e "s|my \$BINDIR.*|my \$BINDIR=\"${prefix}/libexec/prokka\";|" \
           -e "s|my \$DBDIR.*|my \$DBDIR=\"${prokka_db}\";|" \
           bin/prokka

    # install every script shipped in bin/
    install -m 0755 bin/* "${prefix}/bin/"

    # install prokka binaries
    mkdir -p "${prefix}/libexec/${PN}"
    install -m 0755 binaries/linux/* "${prefix}/libexec/${PN}/"
    # parallel is installed via packet manager
    install -m 0755 binaries/common/minced "${prefix}/libexec/${PN}/"
    install -m 0644 binaries/common/minced.jar "${prefix}/libexec/${PN}/"

    # setup prokka db
    prokka_cmd="${prefix}/bin/${PN}"

    ${prokka_cmd} --setupdb
    cd /

    ###################
    # CRISPRCasFinder #
    ###################
    # Download the CRISPRCasFinder sources, patch the main script and install
    # it together with its supplementary data files.
    PN="CRISPRCasFinder"
    PV="4.2.18"
    P="${PN}-${PV}"

    mkdir -p "${prefix}/src/${PN}"
    cd "${prefix}/src/${PN}"

    # NOTE(review): this fetches the tip of the master branch, not a snapshot
    # tied to PV, so the installed code may not be version ${PV} - confirm
    # whether a versioned archive should be used instead.
    cripsr_cas_url="https://github.com/dcouvin/${PN}/archive/master.zip"
    # --fail aborts on an HTTP error instead of saving an error page that
    # unzip would later reject; --show-error keeps the reason visible.
    curl -L -o "${PN}.zip" --silent --show-error --fail "${cripsr_cas_url}"

    unzip "${PN}.zip"
    mv "${PN}-master" "${PN}"

    cd "${PN}"
    crispr_data="${prefix}/share/${PN}"
    mkdir -p "${crispr_data}"

    # first the patch shipped with the sources, then the Singularity-specific
    # patch that %setup placed one directory above the unpacked sources
    patch CRISPRCasFinder.pl CRISPRCasFinder.patch
    patch CRISPRCasFinder.pl ../CRISPRCasFinder.singularity.patch

    install -m 0755 CRISPRCasFinder.pl "${prefix}/bin/CRISPRCasFinder"
    install -m 0644 supplementary_files/crispr.css "${crispr_data}"
    install -m 0644 supplementary_files/Repeat_List.csv "${crispr_data}"
    install -m 0644 supplementary_files/CRISPR_crisprdb.csv "${crispr_data}"
    install -m 0644 supplementary_files/repeatDirection.tsv "${crispr_data}"

    #############
    # CasFinder #
    #############
    # Replace the models bundled with macsyfinder by the CasFinder models
    # distributed with CRISPRCasFinder. This relies on the current directory
    # still being the unpacked CRISPRCasFinder source tree.
    cas_data="${prefix}/share/macsyfinder/"
    # drop the definitions and profiles that came with macsyfinder
    rm -Rf "${cas_data}DEF" "${cas_data}profiles"
    # copy the CasFinder-2.0.2 directory into the macsyfinder data directory
    cp -r CasFinder-2.0.2 "${cas_data}"
    cd /



%test
    # Smoke test: run CRISPRCasFinder on the sequence shipped in install_test
    # and compare the generated TSV reports with the reference reports.
    # On success the result directory is removed; on failure it is kept.
    stamp=$(date '+%Y-%m-%d-%H:%M:%S')
    result_dir="/tmp/test_CRISPRCasFinder_${stamp}"
    prefix=/usr/local
    crispr_cas_src="${prefix}/src/CRISPRCasFinder/CRISPRCasFinder/"

    # Report the failure and exit with the status given as first argument.
    fail() {
        echo "Test failed see ${result_dir} for details."
        exit "$1"
    }

    # "cmd || fail $?" is used rather than inspecting $? on the following line,
    # so the message is still printed if the shell runs with errexit (-e).
    CRISPRCasFinder -def General -cas -i "${crispr_cas_src}/install_test/sequence.fasta" -out "${result_dir}" -keep || fail $?

    for f in "Cas_REPORT.tsv" "Crisprs_REPORT.tsv"
    do
        diff "${crispr_cas_src}/install_test/${f}" "${result_dir}/TSV/${f}" || fail $?
    done

    rm -Rf "${result_dir}"
    exit 0

%runscript
    # Replace the shell with CRISPRCasFinder, passing all arguments through unchanged.
    exec /usr/local/bin/CRISPRCasFinder "$@"

